In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.cross_validation import train_test_split,KFold
import numpy as np

In [2]:
datafile_train=r'Data/consumer/Consumer_Complaints_train.csv'
datafile_test=r'Data/consumer/Consumer_Complaints_test.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)

In [ ]:
cd_train.dtypes

In [3]:
for col in ['Date received','Date sent to company']:
    cd_train[col]=pd.to_datetime(cd_train[col],infer_datetime_format=True)
    cd_test[col]=pd.to_datetime(cd_test[col],infer_datetime_format=True)

In [4]:
cd_train['day_diff']=pd.to_numeric(cd_train['Date sent to company']-cd_train['Date received'])
cd_test['day_diff']=pd.to_numeric(cd_test['Date sent to company']-cd_test['Date received'])

In [5]:
for col in ['Date received','Date sent to company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)

In [ ]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

In [ ]:
cd_train.isnull().sum()

In [ ]:
len(pd.isnull(cd_train['Tags']))
len(cd_train)

In [6]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    cd_train[varname]=np.where(pd.isnull(cd_train[col]),1,0)
    cd_train.drop([col],1,inplace=True)
    cd_test[varname]=np.where(pd.isnull(cd_test[col]),1,0)
    cd_test.drop([col],1,inplace=True)

In [ ]:
cd_train.head(4)

In [15]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

In [7]:
for col in ['ZIP code','Company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)

In [8]:
cd_train['Consumer disputed?']=np.where(cd_train['Consumer disputed?']=="Yes",1,0)

In [9]:
k=cd_train['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['Issue']==val,1,0)
    cd_test[varname]=np.where(cd_test['Issue']==val,1,0)
del cd_train['Issue']
del cd_test['Issue']

In [13]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

In [10]:
k=cd_train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['State']==val,1,0)
    cd_test[varname]=np.where(cd_test['State']==val,1,0)
del cd_train['State']
del cd_test['State']

In [11]:
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    temp=pd.get_dummies(cd_train[col],prefix=col,drop_first=True)
    cd_train=pd.concat([temp,cd_train],1)
    cd_train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(cd_test[col],prefix=col,drop_first=True)
    cd_test=pd.concat([temp,cd_test],1)
    cd_test.drop([col],1,inplace=True)

In [12]:
x = cd_train.drop(['Consumer disputed?','Complaint ID'],1)
y = cd_train['Consumer disputed?']

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report

Optimizing model...

Run train_test splits on the train data


In [14]:
ld_train, ld_test = train_test_split(cd_train, test_size=0.2, random_state=2)

In [15]:
x80_train = ld_train.drop(['Consumer disputed?','Complaint ID'],1)
y80_train = ld_train['Consumer disputed?']

x20_test = ld_test.drop(['Consumer disputed?','Complaint ID'],1)
y20_test = ld_test['Consumer disputed?']

1. Check ROC_AUC_SCORE {penalty='l1', class_weight=None}


In [19]:
model_logr1 = LogisticRegression(penalty="l1",class_weight=None,random_state=2)

In [20]:
model_logr1.fit(x80_train, y80_train)


Out[20]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
#y20_test_pred = np.where(model_logr1.predict(ld_test.drop(['Complaint ID','Consumer disputed?'],1))==1,1,0)
y20_test_pred = np.where(model_logr1.predict(x20_test)==1,1,0)
temp_df = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred))), columns=['Complaint ID','Consumer disputed?'])

y_test_pred = temp_df['Consumer disputed?']

In [23]:
roc_auc_score(y20_test, y_test_pred)


Out[23]:
0.5

2. Check ROC_AUC_SCORE {penalty='l2', class_weight=None}


In [24]:
model_logrl2 = LogisticRegression(penalty="l2",class_weight=None,random_state=2)

In [25]:
model_logrl2.fit(x80_train, y80_train)


Out[25]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
y20_test_pred = np.where(model_logrl2.predict(x20_test)==1,1,0)
temp_df = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred))), columns=['Complaint ID','Consumer disputed?'])

y_test_pred = temp_df['Consumer disputed?']

In [27]:
roc_auc_score(y20_test, y_test_pred)


Out[27]:
0.4996866817311203

3. Check ROC_AUC_SCORE {penalty='l1', class_weight='balanced'}


In [28]:
model_logr2 = LogisticRegression(penalty="l1",class_weight="balanced",random_state=2)

In [29]:
model_logr2.fit(x80_train, y80_train)


Out[29]:
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [30]:
y20_test_pred2 = np.where(model_logr2.predict(ld_test.drop(['Complaint ID','Consumer disputed?'],1))==1,1,0)
temp_df2 = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred2))),
                       columns=['Complaint ID','Consumer disputed?'])

y_test_pred2 = temp_df2['Consumer disputed?']

In [31]:
roc_auc_score(y20_test, y_test_pred2)


Out[31]:
0.5775594371000643

4. Check ROC_AUC_SCORE {penalty='l2', class_weight='balanced'}


In [32]:
model_logr3 = LogisticRegression(penalty="l2",class_weight="balanced",random_state=2)

In [33]:
model_logr3.fit(x80_train, y80_train)


Out[33]:
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [34]:
y20_test_pred3 = np.where(model_logr3.predict(ld_test.drop(['Complaint ID','Consumer disputed?'],1))==1,1,0)
temp_df3 = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred3))),
                       columns=['Complaint ID','Consumer disputed?'])

y_test_pred3 = temp_df3['Consumer disputed?']

In [35]:
roc_auc_score(y20_test, y_test_pred3)


Out[35]:
0.52907311874384377

2. Optimizing Model continues...

a. Employ CV procedure


In [36]:
from sklearn import cross_validation

In [38]:
predicted = cross_validation.cross_val_predict(model_logr2, x, y, cv=10)
print(accuracy_score(y, predicted))
print(classification_report(y, predicted))


0.568003510755
             precision    recall  f1-score   support

          0       0.84      0.56      0.67    249426
          1       0.27      0.60      0.37     67315

avg / total       0.72      0.57      0.61    316741

3. Cutoff based predicted probabilities


In [55]:
prob_score=pd.Series(list(zip(*model_logr2.predict_proba(x80_train)))[1])

In [56]:
cutoffs=np.linspace(0,1,100)

For each of these cutoff , we are going to look at TP,FP,TN,FN values and caluclate KS. Then we'll choose the best cutoff as the one having highest KS.


In [58]:
KS_cut=[]
for cutoff in cutoffs:
    predicted=pd.Series([0]*len(y80_train))
    predicted[prob_score>cutoff]=1
    df=pd.DataFrame(list(zip(y80_train,predicted)),columns=["real","predicted"])
    TP=len(df[(df["real"]==1) &(df["predicted"]==1) ])
    FP=len(df[(df["real"]==0) &(df["predicted"]==1) ])
    TN=len(df[(df["real"]==0) &(df["predicted"]==0) ])
    FN=len(df[(df["real"]==1) &(df["predicted"]==0) ])
    P=TP+FN
    N=TN+FP
    KS=(TP/P)-(FP/N)
    KS_cut.append(KS)

cutoff_data=pd.DataFrame(list(zip(cutoffs,KS_cut)),columns=["cutoff","KS"])

KS_cutoff=cutoff_data[cutoff_data["KS"]==cutoff_data["KS"].max()]["cutoff"]

Now we'll see how this model with the cutoff determined here , performs on the test data.


In [60]:
# Performance on test data
prob_score_test=pd.Series(list(zip(*model_logr2.predict_proba(x20_test)))[1])

predicted_test=pd.Series([0]*len(y20_test))
predicted_test[prob_score_test > float(KS_cutoff)]=1

df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])

k=pd.crosstab(df_test['real'],df_test["predicted"])
print('confusion matrix :\n \n ',k)
TN=k.iloc[0,0]
TP=k.iloc[1,1]
FP=k.iloc[0,1]
FN=k.iloc[1,0]
P=TP+FN
N=TN+FP


confusion matrix :
 
  predicted      0      1
real                   
0          24178  25699
1           4349   9123

In [61]:
# Accuracy of test
(TP+TN)/(P+N)


Out[61]:
0.52567522770683039

In [62]:
# Sensitivity on test
TP/P


Out[62]:
0.67718230403800472

In [63]:
#Specificity on test
TN/N


Out[63]:
0.48475249112817531

Fit the optimized model on actual x,y and predict y from test dataset


In [39]:
model_logr2.fit(x,y)


Out[39]:
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [40]:
prediction = np.where(model_logr2.predict(cd_test.drop(['Complaint ID'],1))==1,"Yes","No")
submission = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(prediction))),
                       columns=['Complaint ID','Consumer disputed?'])

In [49]:
pred_y = submission['Consumer disputed?']
actual_y = cd_train['Consumer disputed?']
# roc_auc_score(actual_y, pred_y) # This will fail since the probability pairs are one-one between y_actual and y_predicted

In [52]:
submission.head(4)


Out[52]:
Complaint ID Consumer disputed?
0 675956 Yes
1 1858795 No
2 32637 Yes
3 1731374 No

In [53]:
submission.to_csv('submission_new.csv',index=False)

This submission will get you auc score of approx 0.50, slightly less, try to increase the score.